/*==============================================================================
FILE NAME: Figure_D1.do
CREATED: 6 July 2025
==============================================================================*/
set more off
clear all

/* Set directory if working independently through code
if c(username)=="" { //insert username
	global rootdir "" // insert root path
	global processed_data "$rootdir/processed_data" 
	global raw_data "$rootdir/raw_data"
	global figures "$rootdir/output/figures" // Define global paths for replication package
} 
*/

** Figure D1
//Panel Ad
//Socio-Demographic and Economic Predictors of Citizen Complaints for Air
infile using "$raw_data/Zipcode Decennial Census Demographics/R12907124.dct", using("$raw_data/Zipcode Decennial Census Demographics/R12907124_SL860.txt") //import raw data

rename ZCTA5 ZipCode
destring ZipCode, replace force

drop if ZipCode < 73301 
drop if ZipCode < 75001 & ZipCode != 73301
drop if ZipCode > 88589
drop if ZipCode < 88510 & ZipCode > 79999 & ZipCode != .

rename T001_001 tot_pop // rename 
rename T002_002 urban_pop
rename T002_005 rural_pop
rename T003_001 pop_density
gen urban_share = urban_pop/tot_pop

gen hisp_share = T015_010/T015_001 
gen white_share = T015_003/T015_001
gen black_share = T015_004/T015_001
egen other = rsum(T015_005 T015_006 T015_007 T015_008 T015_009)
gen other_share = other/T015_001

rename T026_001 avg_hh_size

gen less_HS_share = T040_002/T040_001 
egen college_pop = rsum(T040_004 T040_005 T040_006 T040_007 T040_008) 
gen college_share = college_pop/T040_001

gen LFPR = T069_002/T069_001 

gen unemployment_rate = T069_006/T069_002

rename T093_001 median_hh_inc

rename T160_001 median_year_built

rename T163_001 median_hh_value

rename T167_001 median_rent
rename T168_001 median_rent_share

gen child_poverty_rate = T180_002/T180_001 

egen adult_pop = rsum(T181_001 T182_001)
egen adult_pov = rsum(T181_002 T182_002)
gen adult_pov_rate = adult_pov/adult_pop

rename T218_001 avg_commute_time

keep ZipCode tot_pop urban_share urban_pop rural_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share avg_commute_time hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate

save "$processed_data/zipcode_demographics_new.dta", replace //has data for 2000 zipcode demographics

use "$processed_data/incidents_clean.dta", replace

keep RN CIN IncidentDescription IncidentStatus Number_Of_Complaints incident_air incident_water IncidentNature IncidentEffect incident_waste InvestigationStartDate County_CIN ZipCode_CIN ZipFlag_CIN year

duplicates drop

egen ID = group(RN CIN)

sort ID

bys ID: egen count = count(ID)

duplicates drop ID, force

drop ID 

egen ID = group(RN CIN)

//Create unique complaints
save "$processed_data/unique_complaints.dta", replace


// Panel A: Socio-Demographic and Economic Predictors of Citizen Complaint For Water
use "$processed_data/Emissions_events_clean.dta",clear

drop if year < 2003
drop if year > 2019

collapse (sum) EmissionsEvent, by(ZipCode_EE year)
collapse (mean) EmissionsEvent, by(ZipCode_EE)

rename ZipCode_EE ZipCode

save "$processed_data/Emissions_events_count.dta",replace

use "$processed_data/NSR_Permits_Clean.dta", clear
keep RN ZipCode
duplicates drop
save "$processed_data/NSR_RN_ids.dta", replace
use "$processed_data/TitleV_Permits_Clean.dta", clear
keep RN 
duplicates drop
save "$processed_data/TitleV_RN_ids.dta", replace
append using "$processed_data/NSR_RN_ids.dta"
duplicates drop
duplicates drop RN, force
save "$processed_data/TitleV_NSR_RN_ids.dta", replace

use "$processed_data/facility_characteristics_clean.dta",clear

merge m:1 RN using "$processed_data/TitleV_NSR_RN_ids.dta"
keep if _m == 3
drop _m 
keep ZipCode RN
duplicates drop

bys ZipCode: egen count_RN = count(RN)

collapse (mean) count_RN, by(ZipCode)

save "$processed_data/facility_count.dta",replace

use "$processed_data/facility_characteristics_clean.dta",clear

merge m:1 RN using "$processed_data/TitleV_RN_ids.dta"
keep if _m == 3
drop _m 
keep ZipCode RN
duplicates drop

bys ZipCode: egen count_RN = count(RN)

collapse (mean) count_RN, by(ZipCode)

save "$processed_data/TitleV_facility_count.dta",replace


import delimited "$raw_data/BorderCountiesUpdated.csv", encoding(ISO-8859-1)clear
rename county County
gen border_100km = 1
rename basic border_exact
save "$processed_data/Border_Counties.dta", replace

use "$processed_data/unique_complaints.dta", clear

drop if year < 2003
drop if year > 2019

rename ZipCode_CIN ZipCode
rename County_CIN County
gen complaint = Number_Of_Complaints

drop if incident_air == 0
keep if incident_air != .

merge m:1 County using "$processed_data/Border_Counties.dta" 
keep if _m != 2
drop _m 

replace border_exact = 0 if border_exact == .
replace border_100km = 0 if border_100km == .

collapse (sum) complaint (mean) border_exact border_100km , by(ZipCode year)

collapse (mean) complaint border_exact border_100km, by(ZipCode) 

replace border_100km = 1 if border_100km >= 0.5
replace border_100km = 0 if border_100km < 0.5
replace border_exact = 1 if border_exact >= 0.5
replace border_exact = 0 if border_exact < 0.5

merge 1:1 ZipCode using "$processed_data/zipcode_demographics_new.dta"
keep if _m != 1
drop _m

merge 1:1 ZipCode using "$processed_data/Emissions_events_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/facility_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/mean_PM25.dta"
keep if _m != 2
drop _m 

replace EmissionsEvent = 0 if EmissionsEvent == .

replace complaint = 0 if complaint == .
replace count_RN = 0 if count_RN == .

foreach x in tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share {
replace `x' = . if `x' == 0
}

foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
drop if `x' == .
}

gen median_housing_age = 2000 - median_year_built

foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_housing_age median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
egen std_`x' = std(`x')
}

label var std_count_RN "# of Plants"
label var std_mean_PM25 "Average PM2.5 Concentration (2000-2018)"
label var std_border_100km "Border County"
label var std_tot_pop "Total Population in 2000"
label var std_white_share "White Share in 2000"
label var std_black_share "Black Share in 2000"
label var std_other_share "Other Share in 2000"
label var std_hisp_share "Hispanic Share in 2000"
label var std_urban_share "Urban Population Share in 2000"
label var std_pop_density "Population Density in 2000"
label var std_median_hh_inc "Median Household Income in 2000"
label var std_median_housing_age "Median Housing Age in 2000"
label var std_college_share "Share Some College in 2000"    
label var std_LFPR "Laborforce Participation Rate"
label var std_unemployment_rate "Unemployment Rate"
label var std_child_poverty_rate "Child Poverty Rate"
label var std_adult_pov_rate "Adult Poverty Rate"

gen complaints_pc = complaint/tot_pop 
replace complaints_pc = complaints_pc*1000

foreach x in count_RN mean_PM25 border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
reg complaints_pc std_`x', r
est store `x'
}

** PM25, housing age, median rent, urban share, white share al have a partial R2 > 0.01.
gen complaint_pc_IHS = log(complaints_pc + sqrt(1+complaints_pc^2))
replace complaint_pc_IHS = complaint_pc_IHS*100

gen logcomplaint_pc = log(complaints_pc)

foreach x in count_RN mean_PM25 border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate {
reg logcomplaint_pc std_`x', r
est store `x'

}
graph set window fontface "Times New Roman"
coefplot count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate mean_PM25 , transform(* = 100*(exp(@)-1)) xline(0) pstyle(p1) ciopts(recast(rcap)) graphregion(fcolor(white)) drop(_cons) offset(0) mcolor(navy) msize(medsmall) mfcolor(navy) ylabel(,labsize(large)) xlabel(-20(10)20, nogrid labsize(medsmall)) xtitle("Percent Change in the Number of Air" "Complaints per 1,000 people", size(large)) legend(off) ytitle("Standard Deviation Increase in...", size(large)) graphregion(fcolor(255 255 255)) graphregion(lcolor(255 255 255)) legend(off) xlabel(-100(20)100, angle(45) labsize(large)) name(rankchange_bv, replace) xsize(8.6)
graph export "$figures/Figure_D1_Panel_A.pdf", replace 
keep ZipCode count_RN complaints_pc
export delimited "$processed_data/data_for_maps.csv", replace
save "$processed_data/data_for_maps.dta", replace


// Panel B: Socio-Demographic and Economic Predictors of Citizen Complaint For Water
use "$processed_data/unique_complaints.dta", clear

drop if year < 2003
drop if year > 2019

rename ZipCode_CIN ZipCode
rename County_CIN County
*drop if ZipFlag_CIN == 1
gen complaint = Number_Of_Complaints

drop if incident_water == 0
keep if incident_water != .

merge m:1 County using "$processed_data/Border_Counties.dta" 
keep if _m != 2
drop _m 

replace border_exact = 0 if border_exact == .
replace border_100km = 0 if border_100km == .

collapse (sum) complaint (mean) border_exact border_100km , by(ZipCode year)

collapse (mean) complaint border_exact border_100km, by(ZipCode) 

replace border_100km = 1 if border_100km >= 0.5
replace border_100km = 0 if border_100km < 0.5
replace border_exact = 1 if border_exact >= 0.5
replace border_exact = 0 if border_exact < 0.5

merge 1:1 ZipCode using "$processed_data/zipcode_demographics_new.dta"
keep if _m != 1
drop _m

merge 1:1 ZipCode using "$processed_data/Emissions_events_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/facility_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/mean_PM25.dta"
keep if _m != 2
drop _m 

replace EmissionsEvent = 0 if EmissionsEvent == .

replace complaint = 0 if complaint == .
replace count_RN = 0 if count_RN == .

foreach x in tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share {
replace `x' = . if `x' == 0
}

foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
drop if `x' == .
}

gen median_housing_age = 2000 - median_year_built

foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_housing_age median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
egen std_`x' = std(`x')
}

label var std_count_RN "# of Plants"
label var std_EmissionsEvent "# of Emissions Events"
label var std_mean_PM25 "Average PM2.5 Concentration (2000-2018)"
label var std_border_100km "Border County"
label var std_tot_pop "Total Population in 2000"
label var std_white_share "White Share in 2000"
label var std_black_share "Black Share in 2000"
label var std_other_share "Other Share in 2000"
label var std_hisp_share "Hispanic Share in 2000"
label var std_urban_share "Urban Population Share in 2000"
label var std_pop_density "Population Density in 2000"
label var std_avg_hh_size "Avg. Household Size in 2000"
label var std_median_hh_inc "Median Household Income in 2000"
label var std_median_housing_age "Median Housing Age in 2000"
label var std_median_hh_value "Median Housing Value in 2000"
label var std_median_rent "Median Rent in 2000"
label var std_median_rent_share "Median Rent Share in 2000"
label var std_avg_commute_time "Avg. Commute Time in 2000"
label var std_less_HS_share "Share Highschool Dropout in 2000"    
label var std_college_share "Share Some College in 2000"    
label var std_LFPR "Laborforce Participation Rate"
label var std_unemployment_rate "Unemployment Rate"
label var std_child_poverty_rate "Child Poverty Rate"
label var std_adult_pov_rate "Adult Poverty Rate"

gen complaints_pc = complaint/tot_pop 
replace complaints_pc = complaints_pc*1000

foreach x in count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate {
reg complaints_pc std_`x', r
est store `x'
}

gen complaint_pc_IHS = log(complaints_pc + sqrt(1+complaints_pc^2))
replace complaint_pc_IHS = complaint_pc_IHS*100

gen logcomplaint_pc = log(complaints_pc)

foreach x in count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate {
reg logcomplaint_pc std_`x', r
est store `x'
}
graph set window fontface "Times New Roman"
coefplot count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate, transform(* = 100*(exp(@)-1)) xline(0) pstyle(p1) ciopts(recast(rcap)) graphregion(fcolor(white)) drop(_cons) offset(0) mcolor(navy) msize(medsmall) mfcolor(navy) ylabel(,labsize(large)) xlabel(-70(10)30, nogrid labsize(large)) xtitle("Percent Change in the Number of Water" "Complaints per 1,000 people", size(large)) legend(off) ytitle("Standard Deviation Increase in...", size(large)) graphregion(fcolor(255 255 255)) graphregion(lcolor(255 255 255)) legend(off) xlabel(-100(20)100, angle(45) labsize(large)) name(rankchange_bv, replace) xsize(8.6)
graph export "$figures/Figure_D1_Panel_B.pdf", replace 

// Panel C: Socio-Demographic and Economic Predictors of Citizen Complaint For Waste
use "$processed_data/unique_complaints.dta", clear

drop if year < 2003
drop if year > 2019

rename ZipCode_CIN ZipCode
rename County_CIN County
gen complaint = Number_Of_Complaints

drop if incident_waste == 0
keep if incident_waste != .

merge m:1 County using "$processed_data/Border_Counties.dta" 
keep if _m != 2
drop _m 

replace border_exact = 0 if border_exact == .
replace border_100km = 0 if border_100km == .

collapse (sum) complaint (mean) border_exact border_100km , by(ZipCode year)

collapse (mean) complaint border_exact border_100km, by(ZipCode) 

replace border_100km = 1 if border_100km >= 0.5
replace border_100km = 0 if border_100km < 0.5
replace border_exact = 1 if border_exact >= 0.5
replace border_exact = 0 if border_exact < 0.5

merge 1:1 ZipCode using "$processed_data/zipcode_demographics_new.dta"
keep if _m != 1
drop _m

merge 1:1 ZipCode using "$processed_data/Emissions_events_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/facility_count.dta"
keep if _m != 2
drop _m 

merge 1:1 ZipCode using "$processed_data/mean_PM25.dta"
keep if _m != 2
drop _m 


replace EmissionsEvent = 0 if EmissionsEvent == .

replace complaint = 0 if complaint == .
replace count_RN = 0 if count_RN == .

foreach x in tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share {
replace `x' = . if `x' == 0
}


foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_year_built median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
drop if `x' == .
}

gen median_housing_age = 2000 - median_year_built

foreach x in mean_PM25 EmissionsEvent border_100km count_RN tot_pop pop_density avg_hh_size median_hh_inc median_housing_age median_hh_value median_rent median_rent_share avg_commute_time urban_share hisp_share white_share black_share other_share less_HS_share college_share LFPR unemployment_rate child_poverty_rate adult_pov_rate {
egen std_`x' = std(`x')
}

label var std_count_RN "# of Plants"
label var std_EmissionsEvent "# of Emissions Events"
label var std_mean_PM25 "Average PM2.5 Concentration (2000-2018)"
label var std_border_100km "Border County"
label var std_tot_pop "Total Population in 2000"
label var std_white_share "White Share in 2000"
label var std_black_share "Black Share in 2000"
label var std_other_share "Other Share in 2000"
label var std_hisp_share "Hispanic Share in 2000"
label var std_urban_share "Urban Population Share in 2000"
label var std_pop_density "Population Density in 2000"
label var std_avg_hh_size "Avg. Household Size in 2000"
label var std_median_hh_inc "Median Household Income in 2000"
label var std_median_housing_age "Median Housing Age in 2000"
label var std_median_hh_value "Median Housing Value in 2000"
label var std_median_rent "Median Rent in 2000"
label var std_median_rent_share "Median Rent Share in 2000"
label var std_avg_commute_time "Avg. Commute Time in 2000"
label var std_less_HS_share "Share Highschool Dropout in 2000"    
label var std_college_share "Share Some College in 2000"    
label var std_LFPR "Laborforce Participation Rate"
label var std_unemployment_rate "Unemployment Rate"
label var std_child_poverty_rate "Child Poverty Rate"
label var std_adult_pov_rate "Adult Poverty Rate"

gen complaints_pc = complaint/tot_pop 
replace complaints_pc = complaints_pc*1000

foreach x in count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate {
reg complaints_pc std_`x', r
est store `x'
}

gen complaint_pc_IHS = log(complaints_pc + sqrt(1+complaints_pc^2))
replace complaint_pc_IHS = complaint_pc_IHS*100

gen logcomplaint_pc = log(complaints_pc)

foreach x in count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate {
reg logcomplaint_pc std_`x', r
est store `x'
}
graph set window fontface "Times New Roman"
coefplot count_RN border_100km pop_density median_hh_inc median_housing_age urban_share hisp_share white_share black_share other_share college_share LFPR child_poverty_rate adult_pov_rate, transform(* = 100*(exp(@)-1)) xline(0) pstyle(p1) ciopts(recast(rcap)) graphregion(fcolor(white)) drop(_cons) offset(0) mcolor(navy) msize(medsmall) mfcolor(navy) ylabel(,labsize(large)) xlabel(-20(10)20, nogrid labsize(large)) xtitle("Percent Change in the Number of Waste" "Complaints per 1,000 people", size(large)) legend(off) ytitle("Standard Deviation Increase in...", size(large)) graphregion(fcolor(255 255 255)) graphregion(lcolor(255 255 255)) legend(off) xlabel(-100(20)100, labsize(large) angle(45)) name(rankchange_bv, replace) xsize(8.6)
graph export "$figures/Figure_D1_Panel_C.pdf", replace 